Code to estimate signal and noise on daily basis for SOFIA¶

  • Computes the MLE of the state and the noise term for each daily observation using both RBA replicated version of base SOFIA and the ASX published version
  • Sample goes from 2022-01-04 to 2025-03-19 with four dates where SOFIA is not computed directly due to lack of transactions

Authors: James Brugler Calebe De Roure Marta Khomyn Max Prakoso Tails Putnins

In [1]:
# Load requried packages and print versions
import sys
import pandas as pd
import numpy as np
import statsmodels.api as sm
lowess = sm.nonparametric.lowess
import datetime as dt
import multiprocessing as mp
import warnings
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter
import matplotlib.lines as mlines
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

warnings.filterwarnings('ignore', category=RuntimeWarning)
In [2]:
print('Python version: ', sys.version) # dt and mp are part of standard python library
print('Pandas version: ', pd.__version__)
print('Numpy version: ', np.__version__)
print('Statsmodels version: ', sm.__version__)

pathdata = 'Data/'
Python version:  3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
Pandas version:  1.5.2
Numpy version:  1.21.5
Statsmodels version:  0.13.2
In [3]:
# Load the cash-rate data 
dfcash = pd.read_excel(pathdata + 'f01d.xlsx',
                        skiprows = 10)

dfcash.columns = [x.lower() for x in dfcash.columns]

dictcolsrba = {'series id' : 'date',
               'firmmcrtd' : 'onr',
               'firmmcrid' : 'aonia'}

dfcash = dfcash[dictcolsrba.keys()]
dfcash.columns = [dictcolsrba[x] for x in dictcolsrba.keys()]
In [4]:
# Load the ASX data
dfasx = pd.read_excel(pathdata + 'sofia-beta-version_april_2024_corected.xlsx',
                        skiprows = 4,
                        skipfooter = 10)

dfasx.columns = [x.lower() for x in dfasx.columns]

dictcolsasx = {'date' : 'date',
               'sofia overnight (%) vwap' : 'sofia_vwap_asx',
               'sofia overnight  (%) volume weighted median' : 'sofia_median_asx'}

dfasx = dfasx[dictcolsasx.keys()]
dfasx.columns = [dictcolsasx[x] for x in dictcolsasx.keys()]
In [5]:
# Make a single dataframe with both refrates 
dffin = pd.merge(dfcash,
                 dfasx,
                 on = 'date',
                 how = 'inner')
In [6]:
# Load the new RBA data
dfrba = pd.read_excel(pathdata + 'allSOFIAslonger.xlsx')
dfrba.columns = [x.lower().replace('sofia_','sofia_vwap_').replace('sofiamedian','sofia_median') for x in dfrba.columns]

rbavwapcols = [x for x in dfrba.columns if x.find('vwap')>-1]
rbamediancols = [x for x in dfrba.columns if x.find('median')>-1]

rbacols = ['date']
rbacols.extend(rbavwapcols[:])
rbacols.extend(rbamediancols)
    
dffin = pd.merge(dffin,
                 dfrba[rbacols],
                 on = 'date',
                 how = 'left')
In [7]:
# Load RBA-replicated SOFIA and check it all lines up
dfsofiarep = pd.read_excel(pathdata + 'sofia_replicate.xlsx')
dfsofiarep.columns = [x.lower() for x in dfsofiarep.columns]
dictcolsrba = {'sofiamedian' : 'sofia_median_rbachck', 
               'sofia' : 'sofia_vwap_rbachck'}
dfsofiarep.rename(columns = dictcolsrba, inplace = True)


dffin = pd.merge(dffin,
                 dfsofiarep[['date','sofia_vwap_rbachck','sofia_median_rbachck']],
                 on = 'date',
                 how = 'left')

plt.scatter(dffin.sofia_vwap_asx, dffin.sofia_vwap_rba)
Out[7]:
<matplotlib.collections.PathCollection at 0x1fc6a22fca0>
In [8]:
# Examine first five and last five rows
display(dffin.head())
display(dffin.tail())
date onr aonia sofia_vwap_asx sofia_median_asx sofia_vwap_rba sofia_vwap_0000 sofia_vwap_0040 sofia_vwap_2525 sofia_vwap_2525_2mn sofia_vwap_0525 sofia_vwap_0525_2mn sofia_vwap_relparty sofia_median_rba sofia_median_0000 sofia_median_0040 sofia_median_2525 sofia_median_2525_2mn sofia_median_0525 sofia_median_0525_2mn sofia_median_relparty sofia_vwap_rbachck sofia_median_rbachck
0 2022-01-04 0.1 0.04 0.015187 0.010 0.015187 -0.005466 0.016484 0.01 0.01 0.012037 0.012049 0.015187 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.015187 0.01
1 2022-01-05 0.1 0.04 0.016911 0.020 0.016911 0.003159 0.018639 0.01 0.01 0.014611 0.014627 0.016680 0.02 0.01 0.02 0.01 0.01 0.01 0.01 0.02 0.016911 0.02
2 2022-01-06 0.1 0.04 0.017234 0.010 0.017234 -0.020012 0.019043 0.01 0.01 0.013633 0.013633 0.015464 0.01 0.01 0.02 0.01 0.01 0.01 0.01 0.01 0.017234 0.01
3 2022-01-07 0.1 0.04 0.017647 0.020 0.017647 -0.009158 0.019559 0.01 0.01 0.013813 0.013817 0.016611 0.02 0.01 0.02 0.01 0.01 0.01 0.01 0.01 0.017647 0.02
4 2022-01-10 0.1 0.04 0.017760 0.015 0.017794 -0.004272 0.019743 0.01 0.01 0.015361 0.015368 0.017710 0.01 0.01 0.02 0.01 0.01 0.01 0.01 0.01 0.017794 0.01
date onr aonia sofia_vwap_asx sofia_median_asx sofia_vwap_rba sofia_vwap_0000 sofia_vwap_0040 sofia_vwap_2525 sofia_vwap_2525_2mn sofia_vwap_0525 sofia_vwap_0525_2mn sofia_vwap_relparty sofia_median_rba sofia_median_0000 sofia_median_0040 sofia_median_2525 sofia_median_2525_2mn sofia_median_0525 sofia_median_0525_2mn sofia_median_relparty sofia_vwap_rbachck sofia_median_rbachck
802 2025-03-13 4.1 4.09 4.134282 4.13 4.134282 4.109091 4.136029 4.129060 4.129064 4.129060 4.129064 4.135223 4.13 4.13 4.14 4.13 4.13 4.13 4.13 4.13 NaN NaN
803 2025-03-14 4.1 4.09 4.135620 4.14 4.135595 4.120111 4.136994 4.130000 4.130000 4.130000 4.130000 4.135913 4.14 4.13 4.14 4.13 4.13 4.13 4.13 4.14 NaN NaN
804 2025-03-17 4.1 4.09 4.136397 4.14 4.136411 4.127051 4.138014 4.130000 4.130000 4.130000 4.130000 4.136716 4.14 4.13 4.14 4.13 4.13 4.13 4.13 4.14 NaN NaN
805 2025-03-18 4.1 4.09 4.135447 4.14 4.135447 4.127629 4.136809 4.130000 4.130000 4.130000 4.130000 4.135597 4.14 4.13 4.14 4.13 4.13 4.13 4.13 4.14 NaN NaN
806 2025-03-19 4.1 4.09 4.134332 4.13 4.134332 4.119904 4.135703 4.129619 4.129626 4.129619 4.129626 4.134758 4.13 4.13 4.13 4.13 4.13 4.13 4.13 4.13 NaN NaN
In [9]:
# Examine the sum-stats for the data
display(dffin.describe().T)
count mean std min 25% 50% 75% max
onr 807.0 3.252416 1.484464 0.100000 2.600000 4.100000 4.350000 4.350000
aonia 807.0 3.223234 1.493535 0.040000 2.560000 4.070000 4.320000 4.340000
sofia_vwap_asx 807.0 3.230842 1.515500 0.006326 2.544225 4.092313 4.339667 4.404321
sofia_median_asx 807.0 3.229641 1.516166 0.010000 2.550000 4.100000 4.340000 4.400000
sofia_vwap_rba 803.0 3.231961 1.516501 0.006492 2.545580 4.092760 4.339666 4.404321
sofia_vwap_0000 803.0 3.208980 1.521034 -0.061389 2.524428 4.070573 4.325986 4.393291
sofia_vwap_0040 803.0 3.237651 1.516800 0.014591 2.552440 4.099065 4.343170 4.410237
sofia_vwap_2525 796.0 3.208022 1.519735 -0.005351 2.523751 4.074365 4.328685 4.384996
sofia_vwap_2525_2mn 796.0 3.208065 1.519744 -0.004947 2.523772 4.074373 4.328689 4.384994
sofia_vwap_0525 800.0 3.221637 1.518471 -0.002261 2.536070 4.086702 4.334538 4.393798
sofia_vwap_0525_2mn 800.0 3.221657 1.518460 -0.001895 2.537901 4.086702 4.334543 4.393807
sofia_vwap_relparty 799.0 3.228171 1.520115 0.001329 2.544416 4.094379 4.340921 4.406417
sofia_median_rba 803.0 3.230741 1.517160 0.010000 2.550000 4.100000 4.340000 4.400000
sofia_median_0000 803.0 3.222522 1.517355 0.010000 2.530000 4.080000 4.330000 4.400000
sofia_median_0040 803.0 3.235405 1.516847 0.010000 2.550000 4.100000 4.340000 4.400000
sofia_median_2525 796.0 3.209334 1.520262 0.000000 2.530000 4.070000 4.330000 4.390000
sofia_median_2525_2mn 796.0 3.209410 1.520306 0.000000 2.530000 4.075000 4.330000 4.390000
sofia_median_0525 800.0 3.222675 1.519564 0.010000 2.550000 4.090000 4.340000 4.400000
sofia_median_0525_2mn 800.0 3.222713 1.519562 0.010000 2.550000 4.090000 4.340000 4.400000
sofia_median_relparty 799.0 3.227372 1.520153 0.010000 2.550000 4.100000 4.340000 4.400000
sofia_vwap_rbachck 761.0 3.175133 1.537583 0.006492 2.527913 4.089460 4.339163 4.404321
sofia_median_rbachck 761.0 3.173922 1.538279 0.010000 2.530000 4.090000 4.340000 4.400000
In [10]:
# Dictionaries for labelling plots and lists to analyze

dictlabels = {'sofia_vwap_asx' : 'ASX',
              'sofia_vwap_rba' : 'RBA Rep.',
              }
In [11]:
# Choose which rates to estimate
refextract_all = ['sofia_vwap_asx',
                  'sofia_vwap_rba']
In [12]:
# Decide if to round
roundflag, roundsig = 1, 4
In [13]:
# Create data for MP state-space estimation
"""
For each reference rate, create a Tx1 vector of observed rates and estimate the 
univariate local level state-space model to the state variable and noise term for each
observation
"""
dictdata, dictcntrl, dictdf = {}, {}, {}
estlistall = []
    
for refextract in refextract_all:
    T = len(dffin)
    
    y = np.empty((T,1))
    
    if roundflag == 0:
        y[:,0] = dffin[refextract]
    elif roundflag == 1:
        y[:,0] = np.round(dffin[refextract], decimals = 2)
        
    
    x = dffin[['onr']].values.reshape((T,1))
        
    dictdata[refextract] = y
    dictcntrl[refextract] = x
    dictdf[refextract] = dffin.copy()
    
    x = dictcntrl[refextract]
    xonr = x[:,-1]
    onrvar = x[:,-1].var()
    y = dictdata[refextract]
    datesccy = dictdf[refextract].date
    
    estlistall.append(['full', refextract, y, xonr, onrvar, datesccy])
In [14]:
# Estimate each model in parallel using all but four cores 
from stsp_ll_mods import stspestmp_ll

mp_cores = mp.cpu_count() - 4
pool = mp.Pool(mp_cores)
if __name__ ==  '__main__': 
    ssbyperccy = pool.map(stspestmp_ll, estlistall)
pool.close()    
dfreginf = pd.concat(ssbyperccy)
In [15]:
# Plots and save
for refextract in refextract_all:
    dfres = dfreginf[dfreginf.alt == refextract].copy()
    dfplot = dfres.dfsignoise[0]
    
#     print(dfres['alt'].iloc[0])
#     display(dfplot.describe())

    fig = plt.figure(figsize = (7, 3))
    ax1 = fig.add_subplot(1,1,1)
    
    l1 = ax1.plot(dfplot.datefmt, 
                  dfplot.state_smooth, 'b-',
                  lw= 1, markersize = 3)       
        
    yearsFmt = DateFormatter('%b-%Y')
    years = YearLocator()
    months = MonthLocator()
    ax1.autoscale_view()
    ax1.xaxis.set_tick_params(labelbottom=True,
                             rotation = 90)
    ax1.xaxis.set_major_formatter(yearsFmt)
    ax1.xaxis.set_minor_locator(months)
    ax1.grid(True, linestyle = 'dashed')
    ax1.set_ylabel('Yield (%)')
    
    aonia_leg= mlines.Line2D([], [], color='blue', lw = 1,
                             linestyle = '-', label = 'Smoothed state ' + dictlabels[refextract])
    
    plt.legend(handles=[aonia_leg])
    
    print(dfres.ressum[0])

    dfplot.to_csv('Outputs/' + 'sign_noise_' + refextract + '_apr2025.csv',
                  index = False)
                           Statespace Model Results                           
==============================================================================
Dep. Variable:                      y   No. Observations:                  807
Model:                  llmod_wCntrls   Log Likelihood                2565.672
Date:                Tue, 17 Feb 2026   AIC                          -5123.343
Time:                        14:43:20   BIC                          -5104.570
Sample:                             0   HQIC                         -5116.134
                                - 807                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
state_var   8.931e-06    5.2e-07     17.170      0.000    7.91e-06    9.95e-06
obs_var     7.036e-05   1.29e-06     54.530      0.000    6.78e-05    7.29e-05
betavar0_1     1.0028      0.006    174.801      0.000       0.992       1.014
===================================================================================
Ljung-Box (L1) (Q):                  12.65   Jarque-Bera (JB):             42447.56
Prob(Q):                              0.00   Prob(JB):                         0.00
Heteroskedasticity (H):               1.06   Skew:                            -0.16
Prob(H) (two-sided):                  0.66   Kurtosis:                        38.55
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
                           Statespace Model Results                           
==============================================================================
Dep. Variable:                      y   No. Observations:                  807
Model:                  llmod_wCntrls   Log Likelihood                2540.345
Date:                Tue, 17 Feb 2026   AIC                          -5072.690
Time:                        14:43:20   BIC                          -5053.916
Sample:                             0   HQIC                         -5065.481
                                - 807                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
state_var    9.77e-06   5.15e-07     18.985      0.000    8.76e-06    1.08e-05
obs_var     7.162e-05    1.4e-06     51.040      0.000    6.89e-05    7.44e-05
betavar0_1     1.0014      0.006    169.629      0.000       0.990       1.013
===================================================================================
Ljung-Box (L1) (Q):                  11.13   Jarque-Bera (JB):             42362.65
Prob(Q):                              0.00   Prob(JB):                         0.00
Heteroskedasticity (H):               0.97   Skew:                            -0.03
Prob(H) (two-sided):                  0.81   Kurtosis:                        38.52
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [ ]: